###############################################################################
# Setup and Libraries and Read in Data ----------------------------------------

library(dplyr)
library(ggplot2)
library(stringr)
library(readr)
library(tidyr)
library(purrr)
library(tibble)
library(lubridate)


df = read_rds("data/analysis/plos_cleaned_data.rds")
###############################################################################
# Function Definitions  ----------------------------------------

#' Create a visualization of data availability statement types
#' @param df A data frame containing data availability flags
#' @return A ggplot object showing proportions of different data availability types
#' @import ggplot2
#' @import tidyr
visualize_data_availability = function(df) {
  df |>
    summarise(
      `Cannot Share` = mean(data_cannot_share, na.rm = TRUE),
      `Available on Request` = mean(data_on_request, na.rm = TRUE),
      `Available in Paper and/or SI` = mean(data_in_paper_andor_SI, na.rm = TRUE),
      `Available in Repository` = mean(data_in_repository, na.rm = TRUE),
      `Other availability` = mean(data_other, na.rm = TRUE),
    ) |>
    pivot_longer(
      everything(),
      names_to = "availability_type",
      values_to = "proportion"
    ) |>
    ggplot(aes(x = reorder(availability_type, -proportion), y = proportion)) +
    geom_text(
      aes(label = scales::percent(proportion, accuracy = 0.1)),
      vjust = -0.5
    ) +
    geom_col(fill = "steelblue") +
    scale_y_continuous(labels = scales::percent) +
    labs(
      x = "Data Availability Type",
      y = "Proportion of Articles",
      title = "Data Availability Patterns in Qualitative PLOS Articles",
      caption = sprintf("Based on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}


#' Create a visualization of data availability statement types
#' @param df A data frame containing data availability flags
#' @return A ggplot object showing proportions of different data availability types
#' @import ggplot2
#' @import tidyr
visualize_data_availability = function(df) {
  df |>
    summarise(
      `Cannot Share` = mean(data_cannot_share, na.rm = TRUE),
      `Available on Request` = mean(data_on_request, na.rm = TRUE),
      `Available in Paper and/or SI` = mean(data_in_paper_andor_SI, na.rm = TRUE),
      `Available in Repository` = mean(data_in_repository, na.rm = TRUE)
    ) |>
    pivot_longer(
      everything(),
      names_to = "availability_type",
      values_to = "proportion"
    ) |>
    ggplot(aes(x = reorder(availability_type, -proportion), y = proportion)) +
    geom_text(
      aes(label = scales::percent(proportion, accuracy = 0.1)),
      vjust = -0.5
    ) +
    geom_col(fill = "steelblue") +
    scale_y_continuous(labels = scales::percent) +
    labs(
      x = "Data Availability Type",
      y = "Proportion of Articles",
      title = "Data Availability Patterns in Qualitative PLOS Articles",
      caption = sprintf("Based on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}



#' Create a summary table of data availability percentages
#' @param df A data frame containing data availability flags
#' @return A tibble with counts and percentages for each data availability type
summarize_data_availability = function(df) {
  df |>
    summarise(
      `Cannot Share` = sum(data_cannot_share, na.rm = TRUE),
      `Available on Request` = sum(data_on_request, na.rm = TRUE),
      `Available in Paper and/or SI` = sum(data_in_paper_andor_SI, na.rm = TRUE),
      `Available in Repository` = sum(data_in_repository, na.rm = TRUE),
      total = n()
    ) |>
    pivot_longer(
      -total,
      names_to = "availability_type",
      values_to = "n"
    ) |>
    mutate(
      pct = n / total,
      pct_formatted = scales::percent(pct, accuracy = 0.1)
    ) |>
    select(availability_type, n, pct_formatted) |>
    arrange(desc(n))
}


#' Check and visualize detailed data availability patterns
#' @param df A data frame containing detailed data availability flags
#' @return A ggplot object showing proportions of each availability type
#' @import ggplot2
visualize_data_available_details = function(df) {
  # Check for logical consistency
  inconsistent = df |>
    filter(
      !data_in_repository & 
        (data_not_available | data_available_documentation | 
           data_available_quant | data_available_partial_qual | 
           data_available_full)
    )
  
  if (nrow(inconsistent) > 0) {
    warning(sprintf("Found %d rows with data availability flags where data_in_repository is FALSE", 
                    nrow(inconsistent)))
    # Print problematic rows
    print(inconsistent |>
            select(article_title, url, 
                   data_in_repository, data_not_available, 
                   data_available_documentation, data_available_quant,
                   data_available_partial_qual, data_available_full))
  }
  
  # Create visualization using only repository cases
  df |>
    filter(data_in_repository) |>  # Only look at repository cases
    summarise(
      `Not Available` = mean(data_not_available, na.rm = TRUE),
      `Documentation Available` = mean(data_available_documentation, na.rm = TRUE),
      `Quantitative Data Available` = mean(data_available_quant, na.rm = TRUE),
      `Partial Qualitative Data Available` = mean(data_available_partial_qual, na.rm = TRUE),
      `Full Data Available` = mean(data_available_full, na.rm = TRUE)
    ) |>
    pivot_longer(
      everything(),
      names_to = "type",
      values_to = "proportion"
    ) |>
    arrange(desc(proportion)) |>
    mutate(
      type = factor(type, levels = type)  # Preserve sorting order
    ) |>
    ggplot(aes(x = type, y = proportion)) +
    geom_col(fill = "steelblue") +
    geom_text(
      aes(label = scales::percent(proportion, accuracy = 0.1)),
      vjust = -0.5
    ) +
    scale_y_continuous(
      labels = scales::percent,
      limits = c(0, max(df |>
                          filter(data_in_repository) |>
                          select(data_not_available, data_available_documentation,
                                 data_available_quant, data_available_partial_qual,
                                 data_available_full) |>
                          summarise(across(everything(), \(x) mean(x, na.rm = TRUE))) |>
                          unlist()) * 1.2)  # Add 20% padding for labels
    ) +
    labs(
      x = "Type of Data Availability",
      y = "Proportion of Repository Data",
      title = "Detailed Data Availability Patterns",
      subtitle = sprintf("For articles with data in repository (n=%d)", 
                         sum(df$data_in_repository, na.rm = TRUE))
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1)
    )
}


#' Check consistency of on_request variables
#' @param df A data frame containing data availability flags
#' @return A logical value indicating if all on_request subtypes are consistent
check_request_consistency = function(df) {
  inconsistencies = df |>
    filter(
      !data_on_request & (
        on_request_authors |
          on_request_IRB_ethics_board |
          on_request_original_source |
          on_request_author_institution
      )
    )
  
  if (nrow(inconsistencies) > 0) {
    warning(sprintf("Found %d rows where on_request subtypes are TRUE but data_on_request is FALSE", 
                    nrow(inconsistencies)))
    return(FALSE)
  }
  return(TRUE)
}

#' Create a summary table of on-request subtypes
#' @param df A data frame containing data availability flags
#' @return A tibble with counts and percentages for each on-request type
summarize_request_types = function(df) {
  df |>
    filter(data_on_request) |>  # Only look at cases where data is available on request
    summarise(
      `From Authors` = sum(on_request_authors, na.rm = TRUE),
      `From IRB/Ethics Board` = sum(on_request_IRB_ethics_board, na.rm = TRUE),
      `From Original Source` = sum(on_request_original_source, na.rm = TRUE),
      `From Author Institution` = sum(on_request_author_institution, na.rm = TRUE),
      total = n()
    ) |>
    pivot_longer(
      -total,
      names_to = "request_type",
      values_to = "n"
    ) |>
    mutate(
      pct = n / total,
      pct_formatted = scales::percent(pct, accuracy = 0.1)
    ) |>
    select(request_type, n, pct_formatted) |>
    arrange(desc(n))
}

#' Create a visualization of on-request subtypes
#' @param df A data frame containing data availability flags
#' @return A ggplot object showing proportions of different on-request types
#' @import ggplot2
visualize_request_types = function(df) {
  df |>
    filter(data_on_request) |>  # Only look at cases where data is available on request
    summarise(
      `From Authors` = mean(on_request_authors, na.rm = TRUE),
      `From IRB/Ethics Board` = mean(on_request_IRB_ethics_board, na.rm = TRUE),
      `From Original Source` = mean(on_request_original_source, na.rm = TRUE),
      `From Author Institution` = mean(on_request_author_institution, na.rm = TRUE)
    ) |>
    pivot_longer(
      everything(),
      names_to = "request_type",
      values_to = "proportion"
    ) |>
    ggplot(aes(x = reorder(request_type, -proportion), y = proportion)) +
    geom_text(
      aes(label = scales::percent(proportion, accuracy = 0.1)),
      vjust = -0.5
    ) +
    geom_col(fill = "steelblue") +
    scale_y_continuous(labels = scales::percent) +
    labs(
      x = "Request Type",
      y = "Proportion of On-Request Articles",
      title = "Types of Data Available on Request",
      caption = sprintf("Based on articles with data available on request (n=%d)", 
                        sum(df$data_on_request, na.rm = TRUE))
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}


#' Create a heatmap of data availability patterns by method or type
#' @param df A data frame containing data availability and method/type flags
#' @param var_prefix String indicating which set of variables to analyze ("method_" or "type_")
#' @return A ggplot object showing the relationship between methods/types and data availability
#' @import ggplot2
visualize_availability_patterns = function(df, var_prefix = "method_") {
  
  avail_order = c("Available in Paper and/or SI", "Available on Request", 
                  "Cannot Share", "Available in Repository")
  # Get relevant columns, now including 'other'
  pattern_cols = names(df)[str_detect(names(df), paste0("^", var_prefix))]
  
  # Calculate chi-square tests for significance
  sig_tests = map_dfr(pattern_cols, \(pattern) {
    map_dfr(c("data_cannot_share", "data_on_request", 
              "data_in_paper_andor_SI", "data_in_repository"), \(avail) {
                test = chisq.test(df[[pattern]], df[[avail]])
                tibble(
                  pattern = str_remove(pattern, var_prefix),
                  availability_type = case_when(
                    avail == "data_cannot_share" ~ "Cannot Share",
                    avail == "data_on_request" ~ "Available on Request",
                    avail == "data_in_paper_andor_SI" ~ "Available in Paper and/or SI",
                    avail == "data_in_repository" ~ "Available in Repository"
                  ),
                  p_value = test$p.value
                )
              })
  })
  
  # Prepare the data
  plot_data = df |>
    pivot_longer(
      all_of(pattern_cols),
      names_to = "pattern",
      values_to = "has_pattern"
    ) |>
    mutate(pattern = str_remove(pattern, var_prefix)) |>
    filter(has_pattern) |>
    group_by(pattern) |>
    summarise(
      n_total = n(),
      `Cannot Share` = mean(data_cannot_share, na.rm = TRUE),
      `Available on Request` = mean(data_on_request, na.rm = TRUE),
      `Available in Paper and/or SI` = mean(data_in_paper_andor_SI, na.rm = TRUE),
      `Available in Repository` = mean(data_in_repository, na.rm = TRUE),
      .groups = "drop"
    ) |>
    # Sort here before reshaping
    arrange(desc(n_total)) |>
    pivot_longer(
      -c(pattern, n_total),
      names_to = "availability_type",
      values_to = "proportion"
    ) |>
    left_join(sig_tests, by = c("pattern", "availability_type")) |>
    mutate(
      sig_stars = case_when(
        p_value < 0.001 ~ "***",
        p_value < 0.01 ~ "**",
        p_value < 0.05 ~ "*",
        TRUE ~ ""
      )
    )
  
  # Get pattern order from the sorted data
  pattern_order = unique(plot_data$pattern)
  
  # Create pattern labels preserving the order
  pattern_labels = plot_data |>
    select(pattern, n_total) |>
    distinct() |>
    arrange(match(pattern, pattern_order)) |>  # Maintain the ordering
    mutate(pattern_label = str_c(pattern, "\n(n=", n_total, ")")) |>
    pull(pattern_label)
  
  # Create the plot
  ggplot(plot_data, aes(x = factor(availability_type, levels = avail_order), 
                        y = factor(pattern, levels = rev(pattern_order)))) +  # Reverse to get highest at top
    geom_tile(aes(fill = proportion)) +
    geom_text(aes(label = scales::percent(proportion, accuracy = 0.1)),
              size = 3) +
    geom_text(aes(label = sig_stars),
              size = 3,
              vjust = -1) +
    scale_fill_gradient2(
      low = "white",
      high = "steelblue",
      labels = scales::percent
    ) +
    scale_y_discrete(labels = rev(pattern_labels)) +  # Reverse labels to match
    labs(
      x = "Data Availability Type",
      y = if_else(var_prefix == "method_", "Method", "Data Type"),
      fill = "Proportion",
      title = sprintf("Data Availability Patterns by %s", 
                      if_else(var_prefix == "method_", "Method", "Data Type")),
      caption = sprintf("* p < 0.05, ** p < 0.01, *** p < 0.001\nBased on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.text.y = element_text(lineheight = 0.8)
    )
}



#' Test association between method/type and data availability
#' @param df A data frame containing data availability and method/type flags
#' @param var_prefix String indicating which set of variables to analyze ("method_" or "type_")
#' @return A list of chi-square test results for each availability type
test_availability_associations = function(df, var_prefix = "method_") {
  pattern_cols = names(df)[str_detect(names(df), paste0("^", var_prefix))]
  
  availability_vars = c("data_cannot_share", "data_on_request", 
                        "data_in_paper_andor_SI", "data_in_repository")
  
  map(pattern_cols, \(pattern) {
    map(availability_vars, \(avail) {
      test = chisq.test(df[[pattern]], df[[avail]])
      list(
        pattern = pattern,
        availability = avail,
        p_value = test$p.value,
        chi_square = test$statistic
      )
    })
  }) |>
    list_flatten() |>
    bind_rows()
}



#' Plot trends in data availability over time
#' @param df A data frame containing publication dates and data availability flags 
#' @param by_month Logical indicating whether to group by month (TRUE) or year (FALSE)
#' @return A ggplot object showing availability patterns over time
#' @import ggplot2
#' @import lubridate
visualize_availability_trends = function(df, by_month = FALSE) {
  avail_order = c("Available in Paper and/or SI", "Available on Request", 
                  "Cannot Share", "Available in Repository")
  
  plot_data = df |>
    # Convert dates
    mutate(
      pub_date = as.Date(pub_date, format = "%m/%d/%y"),
      time_period = if(by_month) 
        lubridate::floor_date(pub_date, "month") 
      else 
        lubridate::floor_date(pub_date, "year")
    ) |>
    # Calculate proportions for each time period
    group_by(time_period) |>
    summarise(
      n = n(),
      `Cannot Share` = mean(data_cannot_share, na.rm = TRUE),
      `Available on Request` = mean(data_on_request, na.rm = TRUE),
      `Available in Paper and/or SI` = mean(data_in_paper_andor_SI, na.rm = TRUE),
      `Available in Repository` = mean(data_in_repository, na.rm = TRUE),
      .groups = "drop"
    ) |>
    # Reshape for plotting
    pivot_longer(
      -c(time_period, n),
      names_to = "availability_type",
      values_to = "proportion"
    )
  
  # Create plot
  ggplot(plot_data, 
         aes(x = time_period, y = proportion, color = factor(availability_type, levels = avail_order))) +
    geom_line() +
    geom_point(aes(size = n)) +
    scale_y_continuous(labels = scales::percent) +
    scale_size_continuous(range = c(1, 5)) +
    labs(
      x = if(by_month) "Publication Month" else "Publication Year",
      y = "Proportion of Articles",
      color = "Data Availability Type",
      size = "Number of\nArticles",
      title = "Trends in Data Availability Over Time",
      caption = sprintf("Based on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(
      legend.position = "right",
      axis.text.x = element_text(angle = 45, hjust = 1)
    )
}



#' Create a heatmap of data availability patterns by continent
#' @param df A data frame containing first author continent and data availability flags
#' @return A ggplot object showing the relationship between continent and data availability
#' @import ggplot2 
visualize_continent_availability = function(df) {
  # Calculate chi-square tests for significance
  sig_tests = map_dfr(c("data_cannot_share", "data_on_request", 
                        "data_in_paper_andor_SI", "data_in_repository"), \(avail) {
                          test = chisq.test(df$first_author_continent, df[[avail]])
                          map_dfr(unique(df$first_author_continent), \(cont) {
                            observed = table(df$first_author_continent == cont, df[[avail]])
                            expected = chisq.test(observed)$expected
                            residual = (observed - expected) / sqrt(expected)
                            
                            tibble(
                              pattern = cont,
                              availability_type = case_when(
                                avail == "data_cannot_share" ~ "Cannot Share",
                                avail == "data_on_request" ~ "Available on Request",
                                avail == "data_in_paper_andor_SI" ~ "Available in Paper and/or SI",
                                avail == "data_in_repository" ~ "Available in Repository"
                              ),
                              p_value = test$p.value
                            )
                          })
                        })
  
  # Prepare the data
  plot_data = df |>
    group_by(first_author_continent) |>
    summarise(
      n_total = n(),
      `Cannot Share` = mean(data_cannot_share, na.rm = TRUE),
      `Available on Request` = mean(data_on_request, na.rm = TRUE),
      `Available in Paper` = mean(data_in_paper_andor_SI, na.rm = TRUE),
      `Available in Repository` = mean(data_in_repository, na.rm = TRUE),
      .groups = "drop"
    ) |>
    pivot_longer(
      -c(first_author_continent, n_total),
      names_to = "availability_type",
      values_to = "proportion"
    ) |>
    # Add significance markers
    left_join(
      sig_tests, 
      by = c("first_author_continent" = "pattern", "availability_type")
    ) |>
    mutate(
      sig_stars = case_when(
        p_value < 0.001 ~ "***",
        p_value < 0.01 ~ "**",
        p_value < 0.05 ~ "*",
        TRUE ~ ""
      )
    )
  
  # Get continent order by frequency
  continent_order = plot_data |>
    select(first_author_continent, n_total) |>
    distinct() |>
    arrange(desc(n_total)) |>
    pull(first_author_continent)
  
  # Create continent labels with counts
  continent_labels = plot_data |>
    select(first_author_continent, n_total) |>
    distinct() |>
    arrange(match(first_author_continent, continent_order)) |>
    mutate(continent_label = str_c(first_author_continent, "\n(n=", n_total, ")")) |>
    pull(continent_label)
  
  # Create the plot
  ggplot(plot_data, 
         aes(x = availability_type, 
             y = factor(first_author_continent, levels = rev(continent_order)))) +
    geom_tile(aes(fill = proportion)) +
    geom_text(aes(label = scales::percent(proportion, accuracy = 0.1)),
              size = 3) +
    geom_text(aes(label = sig_stars),
              size = 3,
              vjust = -1) +
    scale_fill_gradient2(
      low = "white",
      high = "steelblue",
      labels = scales::percent
    ) +
    scale_y_discrete(labels = rev(continent_labels)) +
    labs(
      x = "Data Availability Type",
      y = "First Author Continent",
      fill = "Proportion",
      title = "Data Availability Patterns by First Author Continent",
      caption = sprintf("* p < 0.05, ** p < 0.01, *** p < 0.001\nBased on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.text.y = element_text(lineheight = 0.8)
    )
}


#' Create a heatmap of data availability patterns by top countries
#' @param df A data frame containing first author country and data availability flags
#' @param n_countries Number of top countries to show separately (default 5)
#' @return A ggplot object showing the relationship between country and data availability
#' @import ggplot2 
visualize_country_availability = function(df, n_countries = 5) {
  # First identify top countries explicitly
  top_countries = df |>
    count(first_author_country) |>
    arrange(desc(n)) |>
    slice_head(n = n_countries) |>
    pull(first_author_country)
  
  # Create modified dataset with grouped countries
  df_modified = df |>
    mutate(
      country_grouped = if_else(
        first_author_country %in% top_countries,
        first_author_country,
        "Other"
      )
    )
  
  # Calculate chi-square tests for significance
  sig_tests = map_dfr(c("data_cannot_share", "data_on_request", 
                        "data_in_paper_andor_SI", "data_in_repository"), \(avail) {
                          test = chisq.test(df_modified$country_grouped, df_modified[[avail]])
                          map_dfr(unique(df_modified$country_grouped), \(country) {
                            tibble(
                              pattern = country,
                              availability_type = case_when(
                                avail == "data_cannot_share" ~ "Cannot Share",
                                avail == "data_on_request" ~ "Available on Request",
                                avail == "data_in_paper_andor_SI" ~ "Available in Paper and/or SI",
                                avail == "data_in_repository" ~ "Available in Repository"
                              ),
                              p_value = test$p.value
                            )
                          })
                        })
  
  # Prepare the data
  plot_data = df_modified |>
    group_by(country_grouped) |>
    summarise(
      n_total = n(),
      `Cannot Share` = mean(data_cannot_share, na.rm = TRUE),
      `Available on Request` = mean(data_on_request, na.rm = TRUE),
      `Available in Paper` = mean(data_in_paper_andor_SI, na.rm = TRUE),
      `Available in Repository` = mean(data_in_repository, na.rm = TRUE),
      .groups = "drop"
    ) |>
    pivot_longer(
      -c(country_grouped, n_total),
      names_to = "availability_type",
      values_to = "proportion"
    ) |>
    # Add significance markers
    left_join(
      sig_tests, 
      by = c("country_grouped" = "pattern", "availability_type")
    ) |>
    mutate(
      sig_stars = case_when(
        p_value < 0.001 ~ "***",
        p_value < 0.01 ~ "**",
        p_value < 0.05 ~ "*",
        TRUE ~ ""
      )
    )
  
  # Get country order by frequency (ensuring Other is last)
  country_order = plot_data |>
    select(country_grouped, n_total) |>
    distinct() |>
    arrange(country_grouped == "Other", desc(n_total)) |>
    pull(country_grouped)
  
  # Create country labels with counts
  country_labels = plot_data |>
    select(country_grouped, n_total) |>
    distinct() |>
    arrange(match(country_grouped, country_order)) |>
    mutate(country_label = str_c(country_grouped, "\n(n=", n_total, ")")) |>
    pull(country_label)
  
  # Create the plot
  ggplot(plot_data, 
         aes(x = availability_type, 
             y = factor(country_grouped, levels = rev(country_order)))) +
    geom_tile(aes(fill = proportion)) +
    geom_text(aes(label = scales::percent(proportion, accuracy = 0.1)),
              size = 3) +
    geom_text(aes(label = sig_stars),
              size = 3,
              vjust = -1) +
    scale_fill_gradient2(
      low = "white",
      high = "steelblue",
      labels = scales::percent
    ) +
    scale_y_discrete(labels = rev(country_labels)) +
    labs(
      x = "Data Availability Type",
      y = "First Author Country",
      fill = "Proportion",
      title = "Data Availability Patterns by First Author Country",
      subtitle = sprintf("Top %d countries shown, remaining grouped as 'Other'", n_countries),
      caption = sprintf("* p < 0.05, ** p < 0.01, *** p < 0.001\nBased on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.text.y = element_text(lineheight = 0.8)
    )
}



#' Create a heatmap of data availability patterns by top countries and continental regions
#' @param df A data frame containing first author country and continent
#' @param n_countries Number of top countries to show separately (default 5)
#' @return A ggplot object showing the relationship between country/region and data availability
#' @import ggplot2 
visualize_country_continent_availability = function(df, n_countries = 5) {
  
  avail_order = c("Available in Paper and/or SI", "Available on Request", 
                  "Cannot Share", "Available in Repository")
  # First identify top countries
  top_countries = df |>
    count(first_author_country) |>
    arrange(desc(n)) |>
    slice_head(n = n_countries) |>
    pull(first_author_country)
  
  # Create modified dataset with grouped countries by continent
  df_modified = df |>
    mutate(
      country_grouped = if_else(
        first_author_country %in% top_countries,
        first_author_country,
        paste("Other -", first_author_continent)
      )
    )
  
  sig_tests = map_dfr(unique(df_modified$country_grouped), \(country) {
    map_dfr(c("data_cannot_share", "data_on_request", 
              "data_in_paper_andor_SI", "data_in_repository"), \(avail) {
                # Create binary variable for this country
                country_binary = df_modified$country_grouped == country
                test = chisq.test(country_binary, df_modified[[avail]])
                tibble(
                  pattern = country,
                  availability_type = case_when(
                    avail == "data_cannot_share" ~ "Cannot Share",
                    avail == "data_on_request" ~ "Available on Request",
                    avail == "data_in_paper_andor_SI" ~ "Available in Paper and/or SI",
                    avail == "data_in_repository" ~ "Available in Repository"
                  ),
                  p_value = test$p.value
                )
              })
  })
  
  # Prepare the data
  plot_data = df_modified |>
    group_by(country_grouped) |>
    summarise(
      n_total = n(),
      `Cannot Share` = mean(data_cannot_share, na.rm = TRUE),
      `Available on Request` = mean(data_on_request, na.rm = TRUE),
      `Available in Paper and/or SI` = mean(data_in_paper_andor_SI, na.rm = TRUE),
      `Available in Repository` = mean(data_in_repository, na.rm = TRUE),
      .groups = "drop"
    ) |>
    # Add a sorting helper
    mutate(
      is_other = str_detect(country_grouped, "^Other"),
      sort_group = if_else(is_other, 2, 1)
    ) |>
    pivot_longer(
      -c(country_grouped, n_total, is_other, sort_group),
      names_to = "availability_type",
      values_to = "proportion"
    ) |>
    # Add significance markers
    left_join(
      sig_tests, 
      by = c("country_grouped" = "pattern", "availability_type")
    ) |>
    mutate(
      sig_stars = case_when(
        p_value < 0.001 ~ "***",
        p_value < 0.01 ~ "**",
        p_value < 0.05 ~ "*",
        TRUE ~ ""
      )
    )
  
  # Get country order by frequency (top countries first, then Other by continent)
  country_order = plot_data |>
    select(country_grouped, n_total, sort_group) |>
    distinct() |>
    arrange(sort_group, desc(n_total)) |>
    pull(country_grouped)
  
  # Create country labels with counts
  country_labels = plot_data |>
    select(country_grouped, n_total) |>
    distinct() |>
    arrange(match(country_grouped, country_order)) |>
    mutate(country_label = str_c(country_grouped, "\n(n=", n_total, ")")) |>
    pull(country_label)
  
  # Create the plot
  ggplot(plot_data, 
         aes(x = factor(availability_type, levels = avail_order), 
             y = factor(country_grouped, levels = rev(country_order)))) +
    geom_tile(aes(fill = proportion)) +
    geom_text(aes(label = scales::percent(proportion, accuracy = 0.1)),
              size = 3) +
    geom_text(aes(label = sig_stars),
              size = 3,
              vjust = -1) +
    scale_fill_gradient2(
      low = "white",
      high = "steelblue",
      labels = scales::percent
    ) +
    scale_y_discrete(labels = rev(country_labels)) +
    labs(
      x = "Data Availability Type",
      y = "Country/Region",
      fill = "Proportion",
      title = "Data Availability Patterns by Country and Region",
      subtitle = sprintf("Top %d countries shown separately, others grouped by continent", n_countries),
      caption = sprintf("* p < 0.05, ** p < 0.01, *** p < 0.001\nBased on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.text.y = element_text(lineheight = 0.8)
    )
}


#' Create side-by-side plots comparing data availability by funding status
#' @param df A data frame containing funding and data availability indicators
#' @return A ggplot object with proportion comparison
#' @import ggplot2
visualize_funding_availability = function(df) {
  # Prepare data
  plot_data = df |>
    summarise(
      across(
        c(data_cannot_share, data_on_request, data_in_paper_andor_SI, data_in_repository),
        \(x) mean(x[funding_binary], na.rm = TRUE) - mean(x[!funding_binary], na.rm = TRUE)
      )
    ) |>
    pivot_longer(
      everything(),
      names_to = "type",
      values_to = "difference"
    ) |>
    mutate(
      type = case_when(
        type == "data_cannot_share" ~ "Cannot Share",
        type == "data_on_request" ~ "Available on Request",
        type == "data_in_paper_andor_SI" ~ "Available in Paper and/or SI",
        type == "data_in_repository" ~ "Available in Repository"
      )
    )
  
  # Create chi-square test results
  chi_tests = map_dfr(c("data_cannot_share", "data_on_request", 
                        "data_in_paper_andor_SI", "data_in_repository"), \(var) {
                          test = chisq.test(df[[var]], df$funding_binary)
                          tibble(
                            type = case_when(
                              var == "data_cannot_share" ~ "Cannot Share",
                              var == "data_on_request" ~ "Available on Request",
                              var == "data_in_paper_andor_SI" ~ "Available in Paper and/or SI",
                              var == "data_in_repository" ~ "Available in Repository"
                            ),
                            sig_stars = case_when(
                              test$p.value < 0.001 ~ "***",
                              test$p.value < 0.01 ~ "**",
                              test$p.value < 0.05 ~ "*",
                              TRUE ~ ""
                            )
                          )
                        })
  
  # Join significance test results
  plot_data = plot_data |>
    left_join(chi_tests, by = "type")
  
  # Create the difference plot
  ggplot(plot_data, aes(x = type, y = difference)) +
    geom_col(fill = "steelblue") +
    geom_text(aes(label = sig_stars),
              vjust = if_else(plot_data$difference >= 0, -0.5, 1.5)) +
    scale_y_continuous(labels = scales::percent) +
    labs(
      x = "Data Availability Type",
      y = "Difference in Proportion (Funded - Unfunded)",
      title = "Difference in Data Availability by Funding Status",
      caption = sprintf("* p < 0.05, ** p < 0.01, *** p < 0.001\nBased on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

#' Create a faceted proportion plot for data availability by funding
#' @param df A data frame containing funding and data availability indicators
#' @return A ggplot object with proportions by funding status
visualize_funding_proportions = function(df) {
  avail_order = c("Available in Paper and/or SI", "Available on Request", 
                  "Cannot Share", "Available in Repository")
  
  df |>
    pivot_longer(
      c(data_on_request, data_in_paper_andor_SI, data_in_repository, data_cannot_share),
      names_to = "type",
      values_to = "value"
    ) |>
    mutate(
      type = case_when(
        type == "data_cannot_share" ~ "Cannot Share",
        type == "data_on_request" ~ "Available on Request",
        type == "data_in_paper_andor_SI" ~ "Available in Paper and/or SI",
        type == "data_in_repository" ~ "Available in Repository"
      ),
      funding = if_else(funding_binary, "Funded", "Not Funded")
    ) |>
    group_by(type, funding) |>
    summarise(
      prop = mean(value, na.rm = TRUE),
      n_total = n(),
      n_yes = sum(value, na.rm = TRUE),  # Count of "yes" responses
      .groups = "drop"
    ) |>
    # mutate(
    #   # Calculate standard error for proportion
    #   se = sqrt(prop * (1 - prop) / n_total),
    #   # 95% CI using normal approximation
    #   ci_lower = pmax(0, prop - 1.96 * se),
    #   ci_upper = pmin(1, prop + 1.96 * se)
    # ) |>
    ggplot(aes(x = funding, y = prop, fill = funding)) +
    geom_col() +
    # geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), 
    #               width = 0.3) +
    geom_text(aes(label = paste0(scales::percent(prop, accuracy = 0.1), 
                                 " (n=", n_yes, ")")),
              vjust = -0.5)  +
    facet_wrap(~factor(type, levels = avail_order)) +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_brewer(palette = "Set2") +
    labs(
      x = "Funding Status",
      y = "Proportion",
      title = "Data Availability Proportions by Funding Status",
      caption = sprintf("Based on %d articles", nrow(df))
    ) +
    theme_minimal()
}


#' Visualize citation count distributions by data availability type
#' @param df A data frame containing cited_by_count and data availability indicators
#' @return A ggplot object showing citation distributions by data availability type
#' @import ggplot2
#' @import ggridges
visualize_citation_distributions = function(df) {
  # Define consistent order
  type_order = c( "Available in Repository", "Cannot Share", "Available on Request", "Available in Paper and/or SI")
  
  # Prepare data in long format
  plot_data = df |>
    pivot_longer(
      c(data_cannot_share, data_on_request, data_in_paper_andor_SI, data_in_repository),
      names_to = "type",
      values_to = "has_type"
    ) |>
    filter(has_type) |>
    mutate(
      type = case_when(
        type == "data_cannot_share" ~ "Cannot Share",
        type == "data_on_request" ~ "Available on Request",
        type == "data_in_paper_andor_SI" ~ "Available in Paper and/or SI",
        type == "data_in_repository" ~ "Available in Repository"
      ),
      type = factor(type, levels = type_order)
    )
  
  # Calculate medians for annotation
  medians = plot_data |>
    group_by(type) |>
    summarise(median_cites = median(cited_by_count))
  
  # Create the plot
  ggplot(plot_data, aes(x = cited_by_count + 1, y = type, fill = type)) +
    ggridges::geom_density_ridges(
      scale = 0.9,
      alpha = 0.5,
      show.legend = FALSE
    ) +
    geom_point(
      data = medians,
      aes(x = median_cites + 1, y = type),
      shape = 18,
      size = 3,
      show.legend = FALSE
    ) +
    scale_x_log10(
      labels = scales::label_number(),
      breaks = c(1, 2, 5, 10, 20, 50, 100)
    ) +
    labs(
      x = "Citation Count (log scale)",
      y = "Data Availability Type",
      title = "Distribution of Citation Counts by Data Availability Type",
      caption = sprintf("Based on %d articles", nrow(df))
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
###############################################################################
# Generate figures and analysis ----------------------------------------


# Generates Figure 2: Data Availability Patterns in Qualitative PLOS Articles
data_availability_bar = visualize_data_availability(df)
ggsave("figures/Figure2_data_availability_percentage.png", data_availability_bar, dpi = 300,width = 10, unit = 'in')

# summarize_data_availability(df)

# percentages for Table 1: Comparing Studies of Data Availability Statements in PLOS Journals

cat("Percentage of data available in paper and/or SI: ") 
print(mean(df$data_in_paper_andor_SI) *100)

cat("Percentage of data explicitly in paper and SI: ") 
print(mean(df$data_in_paper_SI) *100)

cat("Percentage of data in paper only: ") 
print(mean(df$data_in_paper_alone) *100)

cat("Percentage of data in SI only: ") 
print(mean(df$data_in_SI_alone) *100)

cat("Percentage of data upon request: ") 
print(mean(df$data_on_request) *100)

cat("Percentage of data cannot share: ") 
print(mean(df$data_cannot_share) *100)

cat("Percentage of data in repository: ") 
print(mean(df$data_in_repository) *100)

cat("Percentage of data available combinations: ") 
print(mean(df$data_combination) *100)


cat("Percentage of data availability other: ") 
print(mean(df$data_other) *100)



# Generates Figure 3: Trends in Data Availability Statements over Time
# Availability yearly trends
available_byyear =visualize_availability_trends(df, by_month = FALSE)  # yearly
ggsave("figures/Figure3_available_byyear.png", available_byyear, dpi = 300, width = 8, unit = 'in')

# visualize_availability_trends(df, by_month = TRUE)   # monthly


# Run the analysis
# Checks consistency in data entry
check_request_consistency(df)

# Generates Figure 4: Sources to Which Data Requests Are Directed
# summarize_request_types(df)
request_types_bargraph = visualize_request_types(df)
ggsave("figures/Figure4_request_types_percentage.png", request_types_bargraph, dpi = 300, width = 10, unit = 'in')

# Generates Figure 5: Data Accessibility for Articles with DAS that Indicated Data Shared to Repository
data_available_subtypes = visualize_data_available_details(df)
ggsave("figures/Figure5_data_available_subtypes.png", data_available_subtypes)

# Generates Figure 6: Data Availability Proportions by Funding Status
# visualize_funding_availability(df)

available_byfunding = visualize_funding_proportions(df)
ggsave("figures/Figure6_available_byfunding.png", available_byfunding, dpi = 300, width = 10, height = 8.5, unit = 'in')


# Chi-squared test for difference between funded and unfunded projects (used in text) 
results <- df |>
  pivot_longer(
    c(data_on_request, data_in_paper_andor_SI, data_in_repository, data_cannot_share),
    names_to = "type",
    values_to = "value"
  ) |>
  group_by(type) |>
  summarise(
    p_value = {
      tbl <- table(funding_binary, value)
      
      chisq.test(tbl)$p.value
    }
  )
results

# Generates Figure 7: Data Availability Patterns by Data Type
available_bytype_heatmap = visualize_availability_patterns(df, "type_")
ggsave("figures/Figure7_available_bytype_heatmap.png", available_bytype_heatmap, dpi = 300, width = 7, unit = 'in')

# Appendix Figures

# Appendix figure S1: Data Availability by Country/Continent
available_country_continent = visualize_country_continent_availability(df)
ggsave("figures/Fig. S1_available_country_continent.png", available_country_continent, dpi = 300, width = 7, unit = 'in')

# Appendix figure S2: Data Availability by Method
available_bymethod_heatmap = visualize_availability_patterns(df, "method_")
ggsave("figures/Fig. S2_available_bymethod_heatmap.png", available_bymethod_heatmap, dpi = 300,  width = 7, unit = 'in')

# Appendix figure S3: Citations by Data Availability
citations_byavailable = visualize_citation_distributions(df)
ggsave("figures/Fig. S3_citations_byavailable.png", citations_byavailable, dpi = 300, width = 8, unit = 'in')

